{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c8867b7-03bb-4da6-ae46-28b110b15c33",
   "metadata": {},
   "outputs": [],
   "source": [
    "## tags_comparisons.ipynb\n",
    "## author: Francesco Garassino"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c515ed9-da55-4a43-8474-83e5d1d80398",
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "import glob\n",
    "import random\n",
    "import re\n",
    "from datetime import date\n",
    "from pathlib import Path\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from thefuzz import fuzz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e5533f67-c953-431e-b641-990ab48a54d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# functions\n",
    "# compare tags between taggers, one interview at a time\n",
    "def compare_tags(tags_tab, threshold=95, scorer=None):\n",
    "    \"\"\"Pair up similar tagged sentences made by different taggers.\n",
    "\n",
    "    Within each interview, every tagged sentence is scored against every\n",
    "    sentence tagged by a different tagger. A pair scoring at least\n",
    "    `threshold` is reported once (its mirrored twin is skipped); a sentence\n",
    "    that matches nothing is reported alone, with all \"...2\" fields empty.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    tags_tab : pandas.DataFrame\n",
    "        Must provide the columns 'id', 'tagger', 'tag_nr', 'interview',\n",
    "        'cluster', 'tag' and 'content'.\n",
    "    threshold : int or float, default 95\n",
    "        Minimum score for two sentences to count as a match.\n",
    "    scorer : callable, optional\n",
    "        Called as `scorer(text1, text2)` and expected to return a number.\n",
    "        Defaults to `fuzz.partial_ratio`.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of list\n",
    "        Rows of the form [id1, id2, interview, cluster, tagger1, tagger2,\n",
    "        tag1, tag2, sentence1, sentence2].\n",
    "    \"\"\"\n",
    "    if scorer is None:\n",
    "        scorer = fuzz.partial_ratio\n",
    "\n",
    "    # initialise a list to store comparison results\n",
    "    results = []\n",
    "\n",
    "    # (id1, id2) pairs already reported; a set makes the mirrored-pair check\n",
    "    # constant-time instead of a scan over all results collected so far\n",
    "    reported_pairs = set()\n",
    "\n",
    "    # group in order of first appearance, so the order of the output rows is\n",
    "    # the same on every run (a set of strings has no stable order across runs)\n",
    "    for _, intw_rows in tags_tab.groupby('interview', sort=False):\n",
    "\n",
    "        # make sure no duplicate rows are carried along\n",
    "        intw_tags = intw_rows.drop_duplicates(subset='id')\n",
    "\n",
    "        # convert dataframe to dictionary for faster processing\n",
    "        # keys are (id, tagger, tag_nr) tuples, values hold the remaining columns\n",
    "        # (orient='index' always keys by the index, so no index=True is needed;\n",
    "        # that argument only exists in pandas >= 2.0)\n",
    "        intw_tags_dict = intw_tags.set_index(['id', 'tagger', 'tag_nr']).to_dict(orient='index')\n",
    "\n",
    "        # iterate over the first set of tags (i.e., those done by tagger1)\n",
    "        for key1, value1 in intw_tags_dict.items():\n",
    "\n",
    "            # initialise a counter of matches\n",
    "            matches = 0\n",
    "            taggedtext1 = value1['content']\n",
    "\n",
    "            # iterate over the second set of tags (i.e., those done by tagger2)\n",
    "            for key2, value2 in intw_tags_dict.items():\n",
    "\n",
    "                # skip the sentence itself and anything tagged by the same person\n",
    "                if key1[0] == key2[0] or key1[1] == key2[1]:\n",
    "                    continue\n",
    "\n",
    "                taggedtext2 = value2['content']\n",
    "                score = scorer(taggedtext1, taggedtext2)\n",
    "\n",
    "                # a score at or above the threshold means that parts of the two\n",
    "                # tags mostly correspond - which is what we want\n",
    "                if score >= threshold:\n",
    "                    matches += 1\n",
    "\n",
    "                    # make sure we're not appending double hits\n",
    "                    if (key2[0], key1[0]) in reported_pairs:\n",
    "                        continue\n",
    "                    reported_pairs.add((key1[0], key2[0]))\n",
    "\n",
    "                    # row = ID1, ID2, interview_n, cluster, tagger1, tagger2, category1, category2, sentence1, sentence2\n",
    "                    results.append([key1[0], key2[0], value1['interview'], value1['cluster'], key1[1], key2[1], value1['tag'], value2['tag'], taggedtext1, taggedtext2])\n",
    "\n",
    "            # if the sentence did not match anything tagged from the other tagger, add it to the table as a single entry\n",
    "            # so all \"...2\" fields will be left empty\n",
    "            if matches == 0:\n",
    "                results.append([key1[0], '', value1['interview'], value1['cluster'], key1[1], '', value1['tag'], '', taggedtext1, ''])\n",
    "\n",
    "    # print completion message\n",
    "    print('\\n\\t-- All comparisons done for all interviews')\n",
    "    return results\n",
    "\n",
    "# define a function to split a string into two chunks\n",
    "def split_string(string, pattern):\n",
    "    \"\"\"Split `string` at the first occurrence of `pattern`.\n",
    "\n",
    "    Always returns a two-element pandas Series, so the result can safely be\n",
    "    assigned to two columns: the second element is NaN when `pattern` does\n",
    "    not occur, and both elements are NaN when `string` is a missing value.\n",
    "    Non-string values are converted with str() before splitting.\n",
    "    \"\"\"\n",
    "    missing = float('nan')\n",
    "\n",
    "    # empty CSV cells are read as NaN, which has no .split()\n",
    "    if pd.isna(string):\n",
    "        return pd.Series([missing, missing])\n",
    "\n",
    "    chunks = str(string).split(pattern, 1)\n",
    "\n",
    "    # pad so that the result always has exactly two elements\n",
    "    if len(chunks) < 2:\n",
    "        chunks.append(missing)\n",
    "    return pd.Series(chunks)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef4a7efc-44c1-406d-9ad0-067796b122ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "# retrieve the paths of tables created by tags_embedding_analysis.ipynb\n",
    "# sorted, because glob returns matches in an arbitrary, filesystem-dependent\n",
    "# order and the tables should be processed in the same order on every run\n",
    "clust_tags_paths = sorted(glob.glob('./outputs/*clustered.csv'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e293e733-e9af-4422-8d92-826d235b6366",
   "metadata": {},
   "outputs": [],
   "source": [
    "# column names of the comparison table, in the order compare_tags builds its rows\n",
    "result_columns = ['id1', 'id2', 'interview_n', 'cluster', 'tagger1', 'tagger2', 'category1', 'category2', 'sentence1', 'sentence2']\n",
    "\n",
    "for path in clust_tags_paths:\n",
    "\n",
    "    # file name without its directory, whatever the OS path separator is\n",
    "    filename = Path(path).name\n",
    "    category = filename.split('_')[0]\n",
    "\n",
    "    # status message\n",
    "    print(f'Processing {filename}...')\n",
    "\n",
    "    # read data in\n",
    "    tags_tab = pd.read_csv(path, index_col=0)\n",
    "\n",
    "    # split \"document\" column to separate interview number and tagger information\n",
    "    tags_tab[['interview', 'tagger']] = tags_tab['document'].apply(split_string, pattern='_')\n",
    "\n",
    "    # split \"tag\" column to separate tag number and name\n",
    "    tags_tab[['tag_nr', 'tag_name']] = tags_tab['tag'].apply(split_string, pattern='.')\n",
    "\n",
    "    # compare tags and group those that are similar\n",
    "    results = compare_tags(tags_tab)\n",
    "\n",
    "    # export results to CSV dataframe\n",
    "    results_df = pd.DataFrame(results, columns=result_columns)\n",
    "\n",
    "    # build the file-name-safe category outside the f-string: reusing the\n",
    "    # enclosing quote character inside an f-string only parses on Python >= 3.12\n",
    "    safe_category = category.replace(' ', '-').replace('.', '')\n",
    "    csv_path = f'./outputs/{safe_category}_tags_clustered_comparison.csv'\n",
    "    results_df.to_csv(csv_path, index=False)\n",
    "\n",
    "    print(f'\\n\\t-- Wrote results to {csv_path}\\n')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
